/***************************************************************************
Copyright (c) 2013-2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/* Byte size used to scale indices in the DISPn helpers below
   (presumably one double-precision complex value {real,imag} - TODO confirm). */
#define unit_size 16
/* DISPn(ind,disp): byte displacement ind*n*unit_size + disp, i.e. the
   offset of block number ind when each block holds n units. */
#define DISP32(ind,disp) (ind*unit_size*32+disp)
#define DISP16(ind,disp) (ind*unit_size*16+disp)
#define DISP8(ind,disp) (ind*unit_size*8+disp)
#define DISP4(ind,disp) (ind*unit_size*4+disp)
#define DISP2(ind,disp) (ind*unit_size*2+disp)
#define DISP1(ind,disp) (ind*unit_size+disp)
/* DISPX(disp): the displacement itself, unchanged. */
#define DISPX(disp)  (disp)
/*	HELPERS FOR SAVE	*/
/*
 * LOAD_COUPLE_AS_RR_II: load two 16-byte vectors {r0,i0} and {r1,i1} from
 * LOFFSET(REG) and LOFFSET+16(REG) and regroup them as {r0,r1} (VS_OUT1)
 * and {i0,i1} (VS_OUT2). VS_TEMP1 and VS_TEMP2 are overwritten.
 * Expands to nothing when TRMMKERNEL is defined.
 */
.macro LOAD_COUPLE_AS_RR_II  VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET
#if !defined(TRMMKERNEL)
	lxv	\VS_TEMP1, DISPX(\LOFFSET)(\REG)
	lxv	\VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
	xxmrgld	\VS_OUT1, \VS_TEMP1, \VS_TEMP2
	xxmrghd	\VS_OUT2, \VS_TEMP1, \VS_TEMP2
#endif
.endm
/*
 * RESULT_INTO_REALREAL_IMAGEIMAGE: from the two results {a0r*br,a0i*bi}
 * (VSIN1) and {a1r*br,a1i*bi} (VSIN2) build
 *   VSOUT1 = {a0r*br,a1r*br}   and   VSOUT2 = {a0i*bi,a1i*bi}.
 */
.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
	xxmrgld	\VSOUT1, \VSIN1, \VSIN2		/* real*real of both results */
	xxmrghd	\VSOUT2, \VSIN1, \VSIN2		/* imag*imag of both results */
.endm
/*
 * RESULT_INTO_REALIMAG_IMAGREAL: from the two results {a0r*bi,a0i*br}
 * (VSIN1) and {a1r*bi,a1i*br} (VSIN2) build
 *   VSOUT1 = {a0r*bi,a1r*bi}   and   VSOUT2 = {a0i*br,a1i*br}.
 */
.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
	xxmrgld	\VSOUT1, \VSIN1, \VSIN2		/* real*imag of both results */
	xxmrghd	\VSOUT2, \VSIN1, \VSIN2		/* imag*real of both results */
.endm
/*
 * AGGREGATE_REALS_IMAGES: combine the partial products into the real and
 * imaginary parts of two complex results:
 *   VSINR_OUT1 = {a0r*br op a0i*bi, a1r*br op a1i*bi}  ~ {r0,r1}
 *   VSINI_OUT2 = {a0r*bi op a0i*br, a1r*bi op a1i*br}  ~ {i0,i1}
 * The operation (add or subtract, and its operand order) depends on which of
 * the NN/NT/.../RR variant symbols is defined. Results overwrite VSINR_OUT1
 * and VSINI_OUT2.
 */
.macro  AGGREGATE_REALS_IMAGES  VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
	xvsubdp	\VSINR_OUT1, \VSINR_OUT1, \VSINR
	xvadddp	\VSINI_OUT2, \VSINI_OUT2, \VSINI
#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
	xvadddp	\VSINR_OUT1, \VSINR_OUT1, \VSINR
	xvsubdp	\VSINI_OUT2, \VSINI_OUT2, \VSINI
#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
	xvadddp	\VSINR_OUT1, \VSINR_OUT1, \VSINR
	xvsubdp	\VSINI_OUT2, \VSINI, \VSINI_OUT2
#else	/* CC || CR || RC || RR */
	/* This case assumes {-alpha_r,-alpha_i} is used. */
	/* Computes i1i2-r1r2; alpha real is negated instead to fix the sign. */
	xvsubdp	\VSINR_OUT1, \VSINR, \VSINR_OUT1
	/* Alpha imaginary is negated instead to fix the sign. */
	xvadddp	\VSINI_OUT2, \VSINI_OUT2, \VSINI
#endif
.endm
/*
 * MULT_APLHA_PART1: first half of the multiplication by alpha.
 *   VSOUT1 = {i0,i1}*alpha_i - VSOUT1
 *   VSOUT2 = VSOUT2 + {r0,r1}*alpha_i
 * where VSINRR = {r0,r1} and VSINII = {i0,i1}. When TRMMKERNEL is defined
 * the previous VSOUT contents are ignored and only the products are written.
 */
.macro MULT_APLHA_PART1  VSINRR,VSINII,VSOUT1,VSOUT2
#ifdef TRMMKERNEL
	xvmuldp	\VSOUT1, \VSINII, alpha_i
	xvmuldp	\VSOUT2, \VSINRR, alpha_i
#else
	xvmsubadp	\VSOUT1, \VSINII, alpha_i
	xvmaddadp	\VSOUT2, \VSINRR, alpha_i
#endif
.endm
/*
 * MULT_APLHA_PART2: second half of the multiplication by alpha.
 *   VSOUT1 = {r0,r1}*alpha_r - VSOUT1
 *   VSOUT2 = VSOUT2 + {i0,i1}*alpha_r
 * where VSINRR = {r0,r1} and VSINII = {i0,i1}.
 */
.macro MULT_APLHA_PART2  VSINRR,VSINII,VSOUT1,VSOUT2
	xvmsubadp	\VSOUT1, \VSINRR, alpha_r
	xvmaddadp	\VSOUT2, \VSINII, alpha_r
.endm
/*
 * UNPACK_FOR_STORE: turn {r,r} (VSIN1) and {i,i} (VSIN2) back into two
 * {r,i} vectors (VSOUT1, VSOUT2) ready for stxv (big endian because of stxv).
 */
.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
	xxmrghd	\VSOUT1, \VSIN2, \VSIN1
	xxmrgld	\VSOUT2, \VSIN2, \VSIN1
.endm


/* STORE_COUPLE: store VSIN1 at LOFFSET(REG) and VSIN2 at LOFFSET+16(REG). */
.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2
	stxv	\VSIN1, DISPX(\LOFFSET)(\REG)
	stxv	\VSIN2, DISPX(\LOFFSET+16)(\REG)
.endm


/*
 * SAVE8: combine sixteen partial-product vectors (VSRes1..VSRes16) into
 * eight complex results, multiply by alpha and store 128 bytes starting at
 * LOFFSET(BASE_REG). Unless TRMMKERNEL is defined, the values already at the
 * destination are loaded first and take part in the alpha update.
 * Scratch: vs34-vs53, vs56-vs59; VSRes1..VSRes4 are overwritten as well.
 * The steps for the four couples are interleaved; the order is kept as is
 * because several scratch registers are reused.
 */
.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET
	/* regroup products; load the four destination couples */
	RESULT_INTO_REALREAL_IMAGEIMAGE	\VSRes1,\VSRes3,vs34,vs35
	LOAD_COUPLE_AS_RR_II	vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
	RESULT_INTO_REALIMAG_IMAGREAL	\VSRes2,\VSRes4,vs36,vs37
	LOAD_COUPLE_AS_RR_II	vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32)
	RESULT_INTO_REALREAL_IMAGEIMAGE	\VSRes5,\VSRes7,vs38,vs39
	LOAD_COUPLE_AS_RR_II	vs56,vs57,vs50,vs51,\BASE_REG,(\LOFFSET+64)
	RESULT_INTO_REALIMAG_IMAGREAL	\VSRes6,\VSRes8,vs40,vs41
	LOAD_COUPLE_AS_RR_II	vs58,vs59,vs52,vs53,\BASE_REG,(\LOFFSET+96)
	RESULT_INTO_REALREAL_IMAGEIMAGE	\VSRes9,\VSRes11,vs42,vs43
	AGGREGATE_REALS_IMAGES	vs34,vs35,vs36,vs37
	RESULT_INTO_REALIMAG_IMAGREAL	\VSRes10,\VSRes12,vs44,vs45
	AGGREGATE_REALS_IMAGES	vs38,vs39,vs40,vs41
	/* VSRes1..VSRes4 have been consumed above and are reused as outputs */
	RESULT_INTO_REALREAL_IMAGEIMAGE	\VSRes13,\VSRes15,\VSRes1,\VSRes2
	MULT_APLHA_PART1	vs34,vs36,vs46,vs47
	RESULT_INTO_REALIMAG_IMAGREAL	\VSRes14,\VSRes16,\VSRes3,\VSRes4
	MULT_APLHA_PART1	vs38,vs40,vs48,vs49
	MULT_APLHA_PART2	vs34,vs36,vs46,vs47
	AGGREGATE_REALS_IMAGES	vs42,vs43,vs44,vs45
	MULT_APLHA_PART2	vs38,vs40,vs48,vs49
	AGGREGATE_REALS_IMAGES	\VSRes1,\VSRes2,\VSRes3,\VSRes4
	/* first two couples: unpack and store while the last two are scaled */
	UNPACK_FOR_STORE	vs46,vs47,vs39,vs41
	MULT_APLHA_PART1	vs42,vs44,vs56,vs57
	UNPACK_FOR_STORE	vs48,vs49,vs35,vs37
	MULT_APLHA_PART1	\VSRes1,\VSRes3,vs58,vs59
	STORE_COUPLE	\BASE_REG,\LOFFSET,vs39,vs41
	MULT_APLHA_PART2	vs42,vs44,vs56,vs57
	STORE_COUPLE	\BASE_REG,(\LOFFSET+32),vs35,vs37
	MULT_APLHA_PART2	\VSRes1,\VSRes3,vs58,vs59
	/* last two couples */
	UNPACK_FOR_STORE	vs56,vs57,vs42,vs44
	UNPACK_FOR_STORE	vs58,vs59,\VSRes1,\VSRes3
	STORE_COUPLE	\BASE_REG,(\LOFFSET+64),vs42,vs44
	STORE_COUPLE	\BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3
.endm


/*
 * SAVE4: combine eight partial-product vectors (VSRes1..VSRes8) into four
 * complex results, multiply by alpha and store 64 bytes starting at
 * LOFFSET(BASE_REG). Unless TRMMKERNEL is defined, the destination values
 * are loaded first and take part in the alpha update.
 * Scratch: vs34-vs41, vs46-vs53.
 */
.macro SAVE4  VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET
	RESULT_INTO_REALREAL_IMAGEIMAGE	\VSRes1,\VSRes3,vs34,vs35
	LOAD_COUPLE_AS_RR_II	vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
	RESULT_INTO_REALIMAG_IMAGREAL	\VSRes2,\VSRes4,vs36,vs37
	LOAD_COUPLE_AS_RR_II	vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32)
	RESULT_INTO_REALREAL_IMAGEIMAGE	\VSRes5,\VSRes7,vs38,vs39
	RESULT_INTO_REALIMAG_IMAGREAL	\VSRes6,\VSRes8,vs40,vs41
	/* real and imaginary parts of both couples */
	AGGREGATE_REALS_IMAGES	vs34,vs35,vs36,vs37
	AGGREGATE_REALS_IMAGES	vs38,vs39,vs40,vs41
	/* multiply by alpha */
	MULT_APLHA_PART1	vs34,vs36,vs46,vs47
	MULT_APLHA_PART1	vs38,vs40,vs48,vs49
	MULT_APLHA_PART2	vs34,vs36,vs46,vs47
	MULT_APLHA_PART2	vs38,vs40,vs48,vs49
	/* back to {r,i} layout and store */
	UNPACK_FOR_STORE	vs46,vs47,vs39,vs41
	UNPACK_FOR_STORE	vs48,vs49,vs35,vs37
	STORE_COUPLE	\BASE_REG,\LOFFSET,vs39,vs41
	STORE_COUPLE	\BASE_REG,(\LOFFSET+32),vs35,vs37
.endm


/*
 * SAVE2: combine four partial-product vectors (VSRes1..VSRes4) into two
 * complex results, multiply by alpha and store 32 bytes at
 * LOFFSET(BASE_REG). Unless TRMMKERNEL is defined, the destination values
 * are loaded first and take part in the alpha update.
 * Scratch: vs34-vs37, vs39, vs41, vs46, vs47, vs50, vs51.
 */
.macro SAVE2  VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET
	RESULT_INTO_REALREAL_IMAGEIMAGE	\VSRes1,\VSRes3,vs34,vs35
	LOAD_COUPLE_AS_RR_II	vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
	RESULT_INTO_REALIMAG_IMAGREAL	\VSRes2,\VSRes4,vs36,vs37
	AGGREGATE_REALS_IMAGES	vs34,vs35,vs36,vs37
	MULT_APLHA_PART1	vs34,vs36,vs46,vs47
	MULT_APLHA_PART2	vs34,vs36,vs46,vs47
	UNPACK_FOR_STORE	vs46,vs47,vs39,vs41
	STORE_COUPLE	\BASE_REG,\LOFFSET,vs39,vs41
.endm


/*
 * SAVE1: combine two partial-product vectors (VSRes1, VSRes2) into one
 * complex result, multiply by alpha and store 16 bytes at
 * LOFFSET(BASE_REG). Unless TRMMKERNEL is defined, the destination value is
 * loaded first and takes part in the alpha update.
 * Scratch: vs34-vs37, vs39, vs41, vs46, vs47, vs50.
 */
.macro SAVE1  VSRes1,VSRes2,BASE_REG,LOFFSET
	RESULT_INTO_REALREAL_IMAGEIMAGE	\VSRes1,\VSRes1,vs34,vs35
#ifndef TRMMKERNEL
	/* single {r,i} value: duplicate each half so the couple helpers apply */
	lxv	vs50, (\LOFFSET)(\BASE_REG)
	xxmrgld	vs46, vs50, vs50
	xxmrghd	vs47, vs50, vs50
#endif
	RESULT_INTO_REALIMAG_IMAGREAL	\VSRes2,\VSRes2,vs36,vs37
	AGGREGATE_REALS_IMAGES	vs34,vs35,vs36,vs37
	MULT_APLHA_PART1	vs34,vs36,vs46,vs47
	MULT_APLHA_PART2	vs34,vs36,vs46,vs47
	/* UNPACK_FOR_STORE already leaves the {r,i} vector to store in vs39, so
	   no further merge is needed before the store. */
	UNPACK_FOR_STORE	vs46,vs47,vs39,vs41
	stxv	vs39, (\LOFFSET)(\BASE_REG)
.endm

/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/

/* Zero out and prime MMA accumulators 0-7. */
.macro  KERNEL2x8_ZERO_AND_PRIME_MMA
	.irp	acc, 0, 1, 2, 3, 4, 5, 6, 7
	xxsetaccz	\acc
	.endr
.endm


/*
 * KERNEL2x8_PRELOAD: load the first operands for KERNEL2x8_2:
 * 128 bytes of A at 0(AO) into vs32-vs39 and 32 bytes of B at 0(BO) into
 * vs48-vs49. AO and BO are not advanced.
 */
.macro KERNEL2x8_PRELOAD
	lxvp	vs32,  0(AO)		/* real,imag from A */
	lxvp	vs34, 32(AO)
	lxvp	vs36, 64(AO)
	lxvp	vs38, 96(AO)
	lxvp	vs48,  0(BO)		/* real,imag from B */
.endm


/*
 * KERNEL2x8_2: two unrolled update steps of the 2x8 kernel.
 * Step 1 consumes vs32-vs39 / vs48-vs49 (loaded by the preload or by the
 * previous expansion) while vs40-vs47 / vs50-vs51 are being loaded; step 2
 * consumes those while vs32-vs39 / vs48-vs49 are reloaded for the next
 * expansion. Accumulators 0-3 use the odd B register, 4-7 the even one.
 *   Index  - block number used to scale the displacements
 *   IsLast - when 1, advance AO by DISP16(Index,256) and BO by DISP4(Index,64)
 */
.macro KERNEL2x8_2	Index, IsLast
	/* loads for step 2 */
	lxvp	vs40, DISP16(\Index,128)(AO)
	lxvp	vs42, DISP16(\Index,160)(AO)
	lxvp	vs44, DISP16(\Index,192)(AO)
	lxvp	vs46, DISP16(\Index,224)(AO)
	lxvp	vs50, DISP4(\Index,32)(BO)
	/* step 1 */
	xvf64gerpp	0, vs32, vs49
	xvf64gerpp	1, vs34, vs49
	xvf64gerpp	2, vs36, vs49
	xvf64gerpp	3, vs38, vs49
	xvf64gerpp	4, vs32, vs48
	xvf64gerpp	5, vs34, vs48
	xvf64gerpp	6, vs36, vs48
	xvf64gerpp	7, vs38, vs48
	/* loads for the next expansion */
	lxvp	vs32, DISP16(\Index,256)(AO)
	lxvp	vs34, DISP16(\Index,288)(AO)
	lxvp	vs36, DISP16(\Index,320)(AO)
	lxvp	vs38, DISP16(\Index,352)(AO)
	lxvp	vs48, DISP4(\Index,64)(BO)
	/* step 2 */
	xvf64gerpp	0, vs40, vs51
	xvf64gerpp	1, vs42, vs51
	xvf64gerpp	2, vs44, vs51
	xvf64gerpp	3, vs46, vs51
	xvf64gerpp	4, vs40, vs50
	xvf64gerpp	5, vs42, vs50
	xvf64gerpp	6, vs44, vs50
	xvf64gerpp	7, vs46, vs50
.if \IsLast==1
	addi	AO, AO, DISP16(\Index,256)
	addi	BO, BO, DISP4(\Index,64)
.endif
.endm


/*
 * LOAD_END_2x8: one update step on the operands already held in
 * vs32-vs39 / vs48-vs49 (nothing is loaded here), then advance BO by
 * OffsetB and AO by OffsetA.
 */
.macro LOAD_END_2x8  OffsetA,OffsetB
	xvf64gerpp	0, vs32, vs49
	xvf64gerpp	1, vs34, vs49
	xvf64gerpp	2, vs36, vs49
	xvf64gerpp	3, vs38, vs49
	xvf64gerpp	4, vs32, vs48
	xvf64gerpp	5, vs34, vs48
	xvf64gerpp	6, vs36, vs48
	xvf64gerpp	7, vs38, vs48
	addi	BO, BO, \OffsetB
	addi	AO, AO, \OffsetA
.endm


/* "Unprime" MMA accumulators 0-7 (xxmfacc on each). */
.macro  KERNEL2x8_UNPRIME_MMA
	.irp	acc, 0, 1, 2, 3, 4, 5, 6, 7
	xxmfacc	\acc
	.endr
.endm


/*
 * SAVE2x8: write back the 2x8 result held in vs0-vs31.
 * Each adjacent register pair is recombined with xxpermdi (masks 0b01 and
 * 0b10) into vs32-vs47, then copied back with the two pairs of every group
 * of four registers swapped. This is done for vs0-vs15 and then vs16-vs31.
 * SAVE8 stores vs0-vs15 at CO and vs16-vs31 at T1 = CO + LDC; CO is then
 * advanced by 128 bytes.
 */
.macro SAVE2x8
	add	T1, CO, LDC

	/* vs0-vs15: recombine pairs into vs32-vs47 */
	xxpermdi	vs32, vs0, vs1, 0b01
	xxpermdi	vs33, vs0, vs1, 0b10
	xxpermdi	vs34, vs2, vs3, 0b01
	xxpermdi	vs35, vs2, vs3, 0b10
	xxpermdi	vs36, vs4, vs5, 0b01
	xxpermdi	vs37, vs4, vs5, 0b10
	xxpermdi	vs38, vs6, vs7, 0b01
	xxpermdi	vs39, vs6, vs7, 0b10
	xxpermdi	vs40, vs8, vs9, 0b01
	xxpermdi	vs41, vs8, vs9, 0b10
	xxpermdi	vs42, vs10, vs11, 0b01
	xxpermdi	vs43, vs10, vs11, 0b10
	xxpermdi	vs44, vs12, vs13, 0b01
	xxpermdi	vs45, vs12, vs13, 0b10
	xxpermdi	vs46, vs14, vs15, 0b01
	xxpermdi	vs47, vs14, vs15, 0b10

	/* copy back, swapping the pairs within each group of four */
	xxlor	vs2, vs32, vs32
	xxlor	vs3, vs33, vs33
	xxlor	vs0, vs34, vs34
	xxlor	vs1, vs35, vs35
	xxlor	vs6, vs36, vs36
	xxlor	vs7, vs37, vs37
	xxlor	vs4, vs38, vs38
	xxlor	vs5, vs39, vs39
	xxlor	vs10, vs40, vs40
	xxlor	vs11, vs41, vs41
	xxlor	vs8, vs42, vs42
	xxlor	vs9, vs43, vs43
	xxlor	vs14, vs44, vs44
	xxlor	vs15, vs45, vs45
	xxlor	vs12, vs46, vs46
	xxlor	vs13, vs47, vs47

	/* vs16-vs31: same treatment */
	xxpermdi	vs32, vs16, vs17, 0b01
	xxpermdi	vs33, vs16, vs17, 0b10
	xxpermdi	vs34, vs18, vs19, 0b01
	xxpermdi	vs35, vs18, vs19, 0b10
	xxpermdi	vs36, vs20, vs21, 0b01
	xxpermdi	vs37, vs20, vs21, 0b10
	xxpermdi	vs38, vs22, vs23, 0b01
	xxpermdi	vs39, vs22, vs23, 0b10
	xxpermdi	vs40, vs24, vs25, 0b01
	xxpermdi	vs41, vs24, vs25, 0b10
	xxpermdi	vs42, vs26, vs27, 0b01
	xxpermdi	vs43, vs26, vs27, 0b10
	xxpermdi	vs44, vs28, vs29, 0b01
	xxpermdi	vs45, vs28, vs29, 0b10
	xxpermdi	vs46, vs30, vs31, 0b01
	xxpermdi	vs47, vs30, vs31, 0b10

	xxlor	vs18, vs32, vs32
	xxlor	vs19, vs33, vs33
	xxlor	vs16, vs34, vs34
	xxlor	vs17, vs35, vs35
	xxlor	vs22, vs36, vs36
	xxlor	vs23, vs37, vs37
	xxlor	vs20, vs38, vs38
	xxlor	vs21, vs39, vs39
	xxlor	vs26, vs40, vs40
	xxlor	vs27, vs41, vs41
	xxlor	vs24, vs42, vs42
	xxlor	vs25, vs43, vs43
	xxlor	vs30, vs44, vs44
	xxlor	vs31, vs45, vs45
	xxlor	vs28, vs46, vs46
	xxlor	vs29, vs47, vs47

	SAVE8	vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
	SAVE8	vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0
	addi	CO, CO, 128
.endm

/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/

/* Zero out and prime MMA accumulators 0-3. */
.macro  KERNEL2x4_ZERO_AND_PRIME_MMA
	.irp	acc, 0, 1, 2, 3
	xxsetaccz	\acc
	.endr
.endm


/*
 * KERNEL2x4_PRELOAD: load the first operands for KERNEL2x4_2:
 * 64 bytes of A at 0(AO) into vs32-vs35 and 32 bytes of B at 0(BO) into
 * vs48-vs49. AO and BO are not advanced.
 */
.macro KERNEL2x4_PRELOAD
	lxvp	vs32,  0(AO)		/* real,imag from A */
	lxvp	vs34, 32(AO)
	lxvp	vs48,  0(BO)		/* real,imag from B */
.endm


/*
 * KERNEL2x4_2: two unrolled update steps of the 2x4 kernel.
 * Step 1 consumes vs32-vs35 / vs48-vs49 while vs40-vs43 / vs50-vs51 are
 * loaded; step 2 consumes those while vs32-vs35 / vs48-vs49 are reloaded
 * for the next expansion.
 *   Index  - block number used to scale the displacements
 *   IsLast - when 1, advance AO by DISP8(Index,128) and BO by DISP4(Index,64)
 */
.macro KERNEL2x4_2 Index, IsLast
	/* loads for step 2 */
	lxvp	vs40, DISP8(\Index,64)(AO)
	lxvp	vs42, DISP8(\Index,96)(AO)
	lxvp	vs50, DISP4(\Index,32)(BO)
	/* step 1 */
	xvf64gerpp	0, vs32, vs49
	xvf64gerpp	1, vs34, vs49
	xvf64gerpp	2, vs32, vs48
	xvf64gerpp	3, vs34, vs48
	/* loads for the next expansion */
	lxvp	vs32, DISP8(\Index,128)(AO)
	lxvp	vs34, DISP8(\Index,160)(AO)
	lxvp	vs48, DISP4(\Index,64)(BO)
	/* step 2 */
	xvf64gerpp	0, vs40, vs51
	xvf64gerpp	1, vs42, vs51
	xvf64gerpp	2, vs40, vs50
	xvf64gerpp	3, vs42, vs50
.if \IsLast==1
	addi	AO, AO, DISP8(\Index,128)
	addi	BO, BO, DISP4(\Index,64)
.endif
.endm
 

/*
 * LOAD_END_2x4: one update step on the operands already held in
 * vs32-vs35 / vs48-vs49 (nothing is loaded here), then advance BO by
 * OffsetB and AO by OffsetA.
 */
.macro LOAD_END_2x4	OffsetA, OffsetB
	xvf64gerpp	0, vs32, vs49
	xvf64gerpp	1, vs34, vs49
	xvf64gerpp	2, vs32, vs48
	xvf64gerpp	3, vs34, vs48
	addi	BO, BO, \OffsetB
	addi	AO, AO, \OffsetA
.endm


/* "Unprime" MMA accumulators 0-3 (xxmfacc on each). */
.macro  KERNEL2x4_UNPRIME_MMA
	.irp	acc, 0, 1, 2, 3
	xxmfacc	\acc
	.endr
.endm


/*
 * SAVE2x4: write back the 2x4 result held in vs0-vs15.
 * Each adjacent register pair is recombined with xxpermdi (masks 0b01 and
 * 0b10) into vs32-vs47, then copied back with the two pairs of every group
 * of four registers swapped. SAVE4 stores vs0-vs7 at CO and vs8-vs15 at
 * T1 = CO + LDC; CO is then advanced by 64 bytes.
 */
.macro SAVE2x4
	add	T1, CO, LDC

	/* recombine pairs into vs32-vs47 */
	xxpermdi	vs32, vs0, vs1, 0b01
	xxpermdi	vs33, vs0, vs1, 0b10
	xxpermdi	vs34, vs2, vs3, 0b01
	xxpermdi	vs35, vs2, vs3, 0b10
	xxpermdi	vs36, vs4, vs5, 0b01
	xxpermdi	vs37, vs4, vs5, 0b10
	xxpermdi	vs38, vs6, vs7, 0b01
	xxpermdi	vs39, vs6, vs7, 0b10
	xxpermdi	vs40, vs8, vs9, 0b01
	xxpermdi	vs41, vs8, vs9, 0b10
	xxpermdi	vs42, vs10, vs11, 0b01
	xxpermdi	vs43, vs10, vs11, 0b10
	xxpermdi	vs44, vs12, vs13, 0b01
	xxpermdi	vs45, vs12, vs13, 0b10
	xxpermdi	vs46, vs14, vs15, 0b01
	xxpermdi	vs47, vs14, vs15, 0b10

	/* copy back, swapping the pairs within each group of four */
	xxlor	vs2, vs32, vs32
	xxlor	vs3, vs33, vs33
	xxlor	vs0, vs34, vs34
	xxlor	vs1, vs35, vs35
	xxlor	vs6, vs36, vs36
	xxlor	vs7, vs37, vs37
	xxlor	vs4, vs38, vs38
	xxlor	vs5, vs39, vs39
	xxlor	vs10, vs40, vs40
	xxlor	vs11, vs41, vs41
	xxlor	vs8, vs42, vs42
	xxlor	vs9, vs43, vs43
	xxlor	vs14, vs44, vs44
	xxlor	vs15, vs45, vs45
	xxlor	vs12, vs46, vs46
	xxlor	vs13, vs47, vs47

	SAVE4	vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
	SAVE4	vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,T1,0
	addi	CO, CO, 64
.endm

/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/

/* Zero out and prime MMA accumulators 0 and 1. */
.macro  KERNEL2x2_ZERO_AND_PRIME_MMA
	.irp	acc, 0, 1
	xxsetaccz	\acc
	.endr
.endm


/*
 * KERNEL2x2_PRELOAD: load the first operands for KERNEL2x2_2:
 * 32 bytes of A at 0(AO) into vs32-vs33 and 32 bytes of B at 0(BO) into
 * vs48-vs49. AO and BO are not advanced.
 */
.macro KERNEL2x2_PRELOAD
	lxvp	vs32, 0(AO)		/* real,imag from A */
	lxvp	vs48, 0(BO)		/* real,imag from B */
.endm


/*
 * KERNEL2x2_2: two unrolled update steps of the 2x2 kernel.
 * Step 1 consumes vs32-vs33 / vs48-vs49 while vs40-vs41 / vs50-vs51 are
 * loaded; step 2 consumes those while vs32-vs33 / vs48-vs49 are reloaded
 * for the next expansion.
 *   Index  - block number used to scale the displacements
 *   IsLast - when 1, advance AO and BO by DISP4(Index,64) each
 */
.macro KERNEL2x2_2 Index, IsLast
	/* loads for step 2 */
	lxvp	vs40, DISP4(\Index,32)(AO)
	lxvp	vs50, DISP4(\Index,32)(BO)
	/* step 1 */
	xvf64gerpp	0, vs32, vs49
	xvf64gerpp	1, vs32, vs48
	/* loads for the next expansion */
	lxvp	vs32, DISP4(\Index,64)(AO)
	lxvp	vs48, DISP4(\Index,64)(BO)
	/* step 2 */
	xvf64gerpp	0, vs40, vs51
	xvf64gerpp	1, vs40, vs50
.if \IsLast==1
	addi	AO, AO, DISP4(\Index,64)
	addi	BO, BO, DISP4(\Index,64)
.endif
.endm

 
/*
 * LOAD_END_2x2: one update step on the operands already held in
 * vs32-vs33 / vs48-vs49 (nothing is loaded here), then advance BO by
 * OffsetB and AO by OffsetA.
 */
.macro LOAD_END_2x2  OffsetA,OffsetB
	xvf64gerpp	0, vs32, vs49
	xvf64gerpp	1, vs32, vs48
	addi	BO, BO, \OffsetB
	addi	AO, AO, \OffsetA
.endm


/* "Unprime" MMA accumulators 0 and 1 (xxmfacc on each). */
.macro  KERNEL2x2_UNPRIME_MMA
	.irp	acc, 0, 1
	xxmfacc	\acc
	.endr
.endm


/*
 * SAVE2x2: write back the 2x2 result held in vs0-vs7.
 * Each adjacent register pair is recombined with xxpermdi (masks 0b01 and
 * 0b10) into vs32-vs39, then copied back with the two pairs of every group
 * of four registers swapped. SAVE2 stores vs0-vs3 at CO and vs4-vs7 at
 * T1 = CO + LDC; CO is then advanced by 32 bytes.
 */
.macro SAVE2x2
	add	T1, CO, LDC

	/* recombine pairs into vs32-vs39 */
	xxpermdi	vs32, vs0, vs1, 0b01
	xxpermdi	vs33, vs0, vs1, 0b10
	xxpermdi	vs34, vs2, vs3, 0b01
	xxpermdi	vs35, vs2, vs3, 0b10
	xxpermdi	vs36, vs4, vs5, 0b01
	xxpermdi	vs37, vs4, vs5, 0b10
	xxpermdi	vs38, vs6, vs7, 0b01
	xxpermdi	vs39, vs6, vs7, 0b10

	/* copy back, swapping the pairs within each group of four */
	xxlor	vs2, vs32, vs32
	xxlor	vs3, vs33, vs33
	xxlor	vs0, vs34, vs34
	xxlor	vs1, vs35, vs35
	xxlor	vs6, vs36, vs36
	xxlor	vs7, vs37, vs37
	xxlor	vs4, vs38, vs38
	xxlor	vs5, vs39, vs39

	SAVE2	vs0,vs1,vs2,vs3,CO,0
	SAVE2	vs4,vs5,vs6,vs7,T1,0
	addi	CO, CO, 32
.endm

/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/

/* ZERO2x1: clear the four 2x1 accumulator registers vs0-vs3. */
.macro ZERO2x1
	xxlxor	vs0, vs0, vs0
	xxlxor	vs1, vs1, vs1
	xxlxor	vs2, vs2, vs2
	xxlxor	vs3, vs3, vs3
.endm


/* LOAD2x1: LOAD2x1O with both offsets zero. */
.macro LOAD2x1
	LOAD2x1O	0, 0
.endm


/*
 * LOAD2x1O: load operands for one 2x1 step.
 * B: two vectors at OffsetB and OffsetB+16 of BO into vs48 and vs50, with
 * doubleword-swapped copies in vs49 and vs51. A: one vector at OffsetA(AO)
 * into vs32.
 */
.macro LOAD2x1O  OffsetA,OffsetB
	lxv	vs48, (\OffsetB+0)(BO)		/* real,imag from B */
	lxv	vs50, (\OffsetB+16)(BO)
	xxswapd	vs49, vs48
	xxswapd	vs51, vs50
	lxv	vs32, (0+\OffsetA)(AO)		/* real,imag from A */
.endm


/* END2x1_WITHOUT_ADD: END2x1 on AO/BO with zero offsets (no pointer update). */
.macro END2x1_WITHOUT_ADD
	END2x1	AO, BO, 0, 0
.endm


/*
 * END2x1: advance BREG by OffsetB and AREG by OffsetA (each only when the
 * offset is non-zero), then accumulate vs32 times vs48/vs50/vs49/vs51 into
 * vs0/vs2/vs1/vs3.
 */
.macro END2x1	AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
	addi	\BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
	addi	\AREG, \AREG, \OffsetA
.endif
	xvmaddadp	vs0, vs32, vs48
	xvmaddadp	vs2, vs32, vs50
	xvmaddadp	vs1, vs32, vs49
	xvmaddadp	vs3, vs32, vs51
.endm


/* LOAD2x1_2: LOAD2x1_2O with both offsets zero. */
.macro LOAD2x1_2
	LOAD2x1_2O	0, 0
.endm


/*
 * LOAD2x1_2O: load operands for two 2x1 steps.
 * B: four vectors at OffsetB+0/16/32/48 of BO into vs48, vs50, vs52, vs54;
 * swapped copies of the first two go to vs49 and vs51 (the swaps of vs52 and
 * vs54 are done at the start of KERNEL2x1_2). A: two vectors at OffsetA and
 * OffsetA+16 of AO into vs32 and vs40.
 */
.macro LOAD2x1_2O  OffsetA,OffsetB
	lxv	vs48, (\OffsetB+0)(BO)		/* real,imag from B */
	lxv	vs50, (\OffsetB+16)(BO)
	lxv	vs52, (\OffsetB+32)(BO)
	lxv	vs54, (\OffsetB+48)(BO)
	xxswapd	vs49, vs48
	xxswapd	vs51, vs50
	lxv	vs32, (0+\OffsetA)(AO)		/* real,imag from A */
	lxv	vs40, (16+\OffsetA)(AO)
.endm


/*
 * END2x1_2: final double step after LOAD2x1_2; no further loads
 * (Complete=1), then AO advances by 32 and BO by 64.
 */
.macro END2x1_2
	KERNEL2x1_2	AO, BO, 32, 64, 0, 1, 1
.endm


/* KERNEL2x1_E2: KERNEL2x1_2 on AO/BO with Complete=1 (no loads for a following step). */
.macro KERNEL2x1_E2	OffsetA,OffsetB, Index,IsLast
	KERNEL2x1_2	AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
.endm


/* KERNEL2x1_L2: KERNEL2x1_2 on AO/BO with Complete=0 (also loads the next operands). */
.macro KERNEL2x1_L2	OffsetA,OffsetB, Index,IsLast
	KERNEL2x1_2	AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
.endm


/*
 * KERNEL2x1_2: two unrolled 2x1 update steps on vs0-vs3.
 * Step 1 uses vs32 with vs48-vs51, step 2 uses vs40 with vs52-vs55
 * (vs53/vs55 are the swapped copies of vs52/vs54, built here).
 *   AREG, BREG       - A and B pointers
 *   OffsetA, OffsetB - extra displacement added to every load
 *   Index            - block number (A scaled by DISP2, B by DISP4)
 *   IsLast           - when 1, advance the pointers at the end
 *   Complete         - 0: reload each operand set right after it is used;
 *                      1: no loads (last step)
 */
.macro KERNEL2x1_2	AREG,BREG,	OffsetA,OffsetB, Index,IsLast ,Complete
	xxswapd	vs53, vs52
	xxswapd	vs55, vs54
	/* step 1 */
	xvmaddadp	vs0, vs32, vs48
	xvmaddadp	vs2, vs32, vs50
	xvmaddadp	vs1, vs32, vs49
	xvmaddadp	vs3, vs32, vs51
.if \Complete==0
	/* reload the step-1 operands for the next expansion */
	lxv	vs32, DISP2(\Index, 0 + \OffsetA)(\AREG)
	lxv	vs48, DISP4(\Index, 0+\OffsetB)(\BREG)
	lxv	vs50, DISP4(\Index, 16+\OffsetB)(\BREG)
	xxswapd	vs49, vs48
	xxswapd	vs51, vs50
.endif
	/* step 2 */
	xvmaddadp	vs0, vs40, vs52
	xvmaddadp	vs2, vs40, vs54
	xvmaddadp	vs1, vs40, vs53
	xvmaddadp	vs3, vs40, vs55
.if \Complete==0
	/* reload the step-2 operands for the next expansion */
	lxv	vs40, DISP2(\Index,16+0+ \OffsetA)(\AREG)
	lxv	vs52, DISP4(\Index, 32+\OffsetB)(\BREG)
	lxv	vs54, DISP4(\Index, 48+\OffsetB)(\BREG)
.endif
.if \IsLast==1
.if \Complete==1
	addi	\AREG, \AREG, DISP2(\Index,\OffsetA)
	addi	\BREG, \BREG, DISP4(\Index,\OffsetB)
.else
	addi	\AREG, \AREG, DISP2(\Index,32)
	addi	\BREG, \BREG, DISP4(\Index,64)
.endif
.endif
.endm
 

/* KERNEL2x1: one complete 2x1 step; load, then advance AO by 16 and BO by 32 and accumulate. */
.macro KERNEL2x1
	LOAD2x1
	END2x1	AO, BO, 16, 32
.endm


/*
 * SAVE2x1: store the 2x1 result; vs0/vs1 go to CO and vs2/vs3 to
 * T1 = CO + LDC via SAVE1, then CO advances by 16 bytes.
 */
.macro SAVE2x1
	add	T1, CO, LDC
	SAVE1	vs0,vs1,CO,0
	SAVE1	vs2,vs3,T1,0
	addi	CO, CO, 16
.endm

/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/

/* Zero out and prime MMA accumulators 0-3. */
.macro  KERNEL1x8_ZERO_AND_PRIME_MMA
	.irp	acc, 0, 1, 2, 3
	xxsetaccz	\acc
	.endr
.endm


/*
 * KERNEL1x8_2: two unrolled update steps of the 1x8 kernel. All operands
 * are loaded here: 256 bytes of A into vs32-vs47 and 32 bytes of B into
 * vs48-vs49. Accumulators 0-3 are updated with vs32-vs39 and vs49, then
 * with vs40-vs47 and vs48.
 *   Index  - block number used to scale the displacements
 *   IsLast - when 1, advance AO by DISP16(Index,256) and BO by DISP2(Index,32)
 */
.macro KERNEL1x8_2	Index,IsLast
	lxvp	vs32, DISP16(\Index,0)(AO)	/* real,imag from A */
	lxvp	vs34, DISP16(\Index,32)(AO)
	lxvp	vs36, DISP16(\Index,64)(AO)
	lxvp	vs38, DISP16(\Index,96)(AO)
	lxvp	vs40, DISP16(\Index,128)(AO)
	lxvp	vs42, DISP16(\Index,160)(AO)
	lxvp	vs44, DISP16(\Index,192)(AO)
	lxvp	vs46, DISP16(\Index,224)(AO)
	lxvp	vs48, DISP2(\Index,0)(BO)	/* real,imag from B */
	/* step 1 */
	xvf64gerpp	0, vs32, vs49
	xvf64gerpp	1, vs34, vs49
	xvf64gerpp	2, vs36, vs49
	xvf64gerpp	3, vs38, vs49
	/* step 2 */
	xvf64gerpp	0, vs40, vs48
	xvf64gerpp	1, vs42, vs48
	xvf64gerpp	2, vs44, vs48
	xvf64gerpp	3, vs46, vs48
.if \IsLast==1
	addi	AO, AO, DISP16(\Index,256)
	addi	BO, BO, DISP2(\Index,32)
.endif
.endm


/*
 * LOAD_END_1x8: single 1x8 update step. Loads 128 bytes of A from 0(AO)
 * into vs32-vs39 and one vector of B from 0(BO) into vs48, updates
 * accumulators 0-3, then advances BO by OffsetB and AO by OffsetA.
 */
.macro LOAD_END_1x8  OffsetA,OffsetB
	lxvp	vs32,  0(AO)		/* real,imag from A */
	lxvp	vs34, 32(AO)
	lxvp	vs36, 64(AO)
	lxvp	vs38, 96(AO)
	lxv	vs48,  0(BO)		/* real,imag from B */
	xvf64gerpp	0, vs32, vs48
	xvf64gerpp	1, vs34, vs48
	xvf64gerpp	2, vs36, vs48
	xvf64gerpp	3, vs38, vs48
	addi	BO, BO, \OffsetB
	addi	AO, AO, \OffsetA
.endm


/* "Unprime" MMA accumulators 0-3 (xxmfacc on each). */
.macro  KERNEL1x8_UNPRIME_MMA
	.irp	acc, 0, 1, 2, 3
	xxmfacc	\acc
	.endr
.endm


/*
 * SAVE1x8: write back the 1x8 result held in vs0-vs15.
 * Each adjacent register pair is recombined with xxpermdi (masks 0b01 and
 * 0b10) into vs32-vs47, then copied back with the two pairs of every group
 * of four registers swapped. SAVE8 stores the result at CO, which is then
 * advanced by 128 bytes.
 */
.macro SAVE1x8
	/* recombine pairs into vs32-vs47 */
	xxpermdi	vs32, vs0, vs1, 0b01
	xxpermdi	vs33, vs0, vs1, 0b10
	xxpermdi	vs34, vs2, vs3, 0b01
	xxpermdi	vs35, vs2, vs3, 0b10
	xxpermdi	vs36, vs4, vs5, 0b01
	xxpermdi	vs37, vs4, vs5, 0b10
	xxpermdi	vs38, vs6, vs7, 0b01
	xxpermdi	vs39, vs6, vs7, 0b10
	xxpermdi	vs40, vs8, vs9, 0b01
	xxpermdi	vs41, vs8, vs9, 0b10
	xxpermdi	vs42, vs10, vs11, 0b01
	xxpermdi	vs43, vs10, vs11, 0b10
	xxpermdi	vs44, vs12, vs13, 0b01
	xxpermdi	vs45, vs12, vs13, 0b10
	xxpermdi	vs46, vs14, vs15, 0b01
	xxpermdi	vs47, vs14, vs15, 0b10

	/* copy back, swapping the pairs within each group of four */
	xxlor	vs2, vs32, vs32
	xxlor	vs3, vs33, vs33
	xxlor	vs0, vs34, vs34
	xxlor	vs1, vs35, vs35
	xxlor	vs6, vs36, vs36
	xxlor	vs7, vs37, vs37
	xxlor	vs4, vs38, vs38
	xxlor	vs5, vs39, vs39
	xxlor	vs10, vs40, vs40
	xxlor	vs11, vs41, vs41
	xxlor	vs8, vs42, vs42
	xxlor	vs9, vs43, vs43
	xxlor	vs14, vs44, vs44
	xxlor	vs15, vs45, vs45
	xxlor	vs12, vs46, vs46
	xxlor	vs13, vs47, vs47

	SAVE8	vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
	addi	CO, CO, 128
.endm

/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/

/* Zero out and prime MMA accumulators 0 and 1. */
.macro  KERNEL1x4_ZERO_AND_PRIME_MMA
	.irp	acc, 0, 1
	xxsetaccz	\acc
	.endr
.endm


/*
 * KERNEL1x4_2: two unrolled update steps of the 1x4 kernel. All operands
 * are loaded here: 128 bytes of A into vs32-vs35 and vs40-vs43, 32 bytes
 * of B into vs48-vs49. Accumulators 0-1 are updated with vs32/vs34 and
 * vs49, then with vs40/vs42 and vs48.
 *   Index  - block number used to scale the displacements
 *   IsLast - when 1, advance AO by DISP8(Index,128) and BO by DISP2(Index,32)
 */
.macro KERNEL1x4_2	Index,IsLast
	lxvp	vs32, DISP8(\Index,0)(AO)	/* real,imag from A */
	lxvp	vs34, DISP8(\Index,32)(AO)
	lxvp	vs40, DISP8(\Index,64)(AO)
	lxvp	vs42, DISP8(\Index,96)(AO)
	lxvp	vs48, DISP2(\Index,0)(BO)	/* real,imag from B */
	/* step 1 */
	xvf64gerpp	0, vs32, vs49
	xvf64gerpp	1, vs34, vs49
	/* step 2 */
	xvf64gerpp	0, vs40, vs48
	xvf64gerpp	1, vs42, vs48
.if \IsLast==1
	addi	AO, AO, DISP8(\Index,128)
	addi	BO, BO, DISP2(\Index,32)
.endif
.endm
 

/*
 * LOAD_END_1x4: single 1x4 update step. Loads 64 bytes of A from 0(AO)
 * into vs32-vs35 and one vector of B from 0(BO) into vs48, updates
 * accumulators 0-1, then advances BO by OffsetB and AO by OffsetA.
 */
.macro LOAD_END_1x4  OffsetA,OffsetB
	lxvp	vs32,  0(AO)		/* real,imag from A */
	lxvp	vs34, 32(AO)
	lxv	vs48,  0(BO)		/* real,imag from B */
	xvf64gerpp	0, vs32, vs48
	xvf64gerpp	1, vs34, vs48
	addi	BO, BO, \OffsetB
	addi	AO, AO, \OffsetA
.endm


/* "Unprime" MMA accumulators 0 and 1 (xxmfacc on each). */
.macro  KERNEL1x4_UNPRIME_MMA
	.irp	acc, 0, 1
	xxmfacc	\acc
	.endr
.endm


/*
 * SAVE1x4: write back the 1x4 result held in vs0-vs7.
 * Each adjacent register pair is recombined with xxpermdi (masks 0b01 and
 * 0b10) into vs32-vs39, then copied back with the two pairs of every group
 * of four registers swapped. SAVE4 stores the result at CO, which is then
 * advanced by 64 bytes.
 */
.macro SAVE1x4
	/* recombine pairs into vs32-vs39 */
	xxpermdi	vs32, vs0, vs1, 0b01
	xxpermdi	vs33, vs0, vs1, 0b10
	xxpermdi	vs34, vs2, vs3, 0b01
	xxpermdi	vs35, vs2, vs3, 0b10
	xxpermdi	vs36, vs4, vs5, 0b01
	xxpermdi	vs37, vs4, vs5, 0b10
	xxpermdi	vs38, vs6, vs7, 0b01
	xxpermdi	vs39, vs6, vs7, 0b10

	/* copy back, swapping the pairs within each group of four */
	xxlor	vs2, vs32, vs32
	xxlor	vs3, vs33, vs33
	xxlor	vs0, vs34, vs34
	xxlor	vs1, vs35, vs35
	xxlor	vs6, vs36, vs36
	xxlor	vs7, vs37, vs37
	xxlor	vs4, vs38, vs38
	xxlor	vs5, vs39, vs39

	SAVE4	vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
	addi	CO, CO, 64
.endm

/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/

/* Zero out and prime MMA accumulator 0. */
.macro  KERNEL1x2_ZERO_AND_PRIME_MMA
	xxsetaccz	0
.endm


/*
 * KERNEL1x2_2: two unrolled update steps of the 1x2 kernel. All operands
 * are loaded here: 64 bytes of A into vs32-vs33 and vs40-vs41, 32 bytes of
 * B into vs48-vs49. Accumulator 0 is updated with vs32 and vs49, then with
 * vs40 and vs48.
 *   Index  - block number used to scale the displacements
 *   IsLast - when 1, advance AO by DISP4(Index,64) and BO by DISP2(Index,32)
 */
.macro KERNEL1x2_2	Index,IsLast
	lxvp	vs32, DISP4(\Index,0)(AO)	/* real,imag from A */
	lxvp	vs40, DISP4(\Index,32)(AO)
	lxvp	vs48, DISP2(\Index,0)(BO)	/* real,imag from B */
	xvf64gerpp	0, vs32, vs49
	xvf64gerpp	0, vs40, vs48
.if \IsLast==1
	addi	AO, AO, DISP4(\Index,64)
	addi	BO, BO, DISP2(\Index,32)
.endif
.endm
 

/*
 * LOAD_END_1x2: single 1x2 update step. Loads 32 bytes of A from 0(AO)
 * into vs32-vs33 and one vector of B from 0(BO) into vs48, updates
 * accumulator 0, then advances BO by OffsetB and AO by OffsetA.
 */
.macro LOAD_END_1x2  OffsetA,OffsetB
	lxvp	vs32, 0(AO)		/* real,imag from A */
	lxv	vs48, 0(BO)		/* real,imag from B */
	xvf64gerpp	0, vs32, vs48
	addi	BO, BO, \OffsetB
	addi	AO, AO, \OffsetA
.endm


/* "Unprime" MMA accumulator 0 (xxmfacc). */
.macro  KERNEL1x2_UNPRIME_MMA
	xxmfacc	0
.endm


/*
 * SAVE1x2: write back the 1x2 result held in vs0-vs3.
 * The two register pairs are recombined with xxpermdi (masks 0b01 and 0b10)
 * into vs32-vs35, then copied back with the pairs swapped. SAVE2 stores the
 * result at CO, which is then advanced by 32 bytes.
 */
.macro SAVE1x2
	xxpermdi	vs32, vs0, vs1, 0b01
	xxpermdi	vs33, vs0, vs1, 0b10
	xxpermdi	vs34, vs2, vs3, 0b01
	xxpermdi	vs35, vs2, vs3, 0b10

	xxlor	vs2, vs32, vs32
	xxlor	vs3, vs33, vs33
	xxlor	vs0, vs34, vs34
	xxlor	vs1, vs35, vs35

	SAVE2	vs0,vs1,vs2,vs3,CO,0
	addi	CO, CO, 32
.endm

/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/

/* ZERO1x1: clear the two 1x1 accumulator registers vs0 and vs1. */
.macro ZERO1x1
	xxlxor	vs0, vs0, vs0
	xxlxor	vs1, vs1, vs1
.endm


/* LOAD1x1: LOAD1x1O with both offsets zero. */
.macro LOAD1x1
	LOAD1x1O	0, 0
.endm


/*
 * LOAD1x1O: load operands for one 1x1 step: the B vector at OffsetB(BO)
 * into vs48 with its doubleword-swapped copy in vs49, and the A vector at
 * OffsetA(AO) into vs32.
 */
.macro LOAD1x1O  OffsetA,OffsetB
	lxv	vs48, (\OffsetB+0)(BO)		/* real,imag from B */
	lxv	vs32, (0+\OffsetA)(AO)		/* real,imag from A */
	xxswapd	vs49, vs48
.endm


/* END1x1_WITHOUT_ADD: END1x1 on AO/BO with zero offsets (no pointer update). */
.macro END1x1_WITHOUT_ADD
	END1x1	AO, BO, 0, 0
.endm


/*
 * END1x1: advance BREG by OffsetB and AREG by OffsetA (each only when the
 * offset is non-zero), then accumulate vs32 times vs48/vs49 into vs0/vs1.
 */
.macro END1x1	AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
	addi	\BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
	addi	\AREG, \AREG, \OffsetA
.endif
	xvmaddadp	vs0, vs32, vs48
	xvmaddadp	vs1, vs32, vs49
.endm


/* LOAD1x1_2: LOAD1x1_2O with both offsets zero. */
.macro LOAD1x1_2
	LOAD1x1_2O	0, 0
.endm


/*
 * LOAD1x1_2O: load operands for two 1x1 steps.
 * B: vectors at OffsetB and OffsetB+16 of BO into vs48 and vs52; the swapped
 * copy of vs48 goes to vs49 (the swap of vs52 is done at the start of
 * KERNEL1x1_2). A: vectors at OffsetA and OffsetA+16 of AO into vs32, vs40.
 */
.macro LOAD1x1_2O  OffsetA,OffsetB
	lxv	vs48, (\OffsetB+0)(BO)		/* real,imag from B */
	lxv	vs52, (\OffsetB+16)(BO)
	xxswapd	vs49, vs48
	lxv	vs32, (0+\OffsetA)(AO)		/* real,imag from A */
	lxv	vs40, (16+\OffsetA)(AO)
.endm


/*
 * END1x1_2: final double step after LOAD1x1_2; no further loads
 * (Complete=1), then AO and BO each advance by 32.
 */
.macro END1x1_2
	KERNEL1x1_2	AO, BO, 32, 32, 0, 1, 1
.endm
 


/* KERNEL1x1_E2: KERNEL1x1_2 on AO/BO with Complete=1 (no loads for a following step). */
.macro KERNEL1x1_E2	OffsetA,OffsetB, Index,IsLast
	KERNEL1x1_2	AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 1
.endm


/* KERNEL1x1_L2: KERNEL1x1_2 on AO/BO with Complete=0 (also loads the next operands). */
.macro KERNEL1x1_L2	OffsetA,OffsetB, Index,IsLast
	KERNEL1x1_2	AO, BO, \OffsetA, \OffsetB, \Index, \IsLast, 0
.endm


/*
 * KERNEL1x1_2: two unrolled 1x1 update steps on vs0/vs1.
 * Step 1 uses vs32 with vs48/vs49, step 2 uses vs40 with vs52/vs53
 * (vs53 is the swapped copy of vs52, built here).
 *   AREG, BREG       - A and B pointers
 *   OffsetA, OffsetB - extra displacement added to every load
 *   Index            - block number (A and B both scaled by DISP2)
 *   IsLast           - when 1, advance the pointers at the end
 *   Complete         - 0: reload each operand set right after it is used;
 *                      1: no loads (last step)
 */
.macro KERNEL1x1_2	AREG,BREG,	OffsetA,OffsetB, Index,IsLast ,Complete
	xxswapd	vs53, vs52
	/* step 1 */
	xvmaddadp	vs0, vs32, vs48
	xvmaddadp	vs1, vs32, vs49
.if \Complete==0
	/* reload the step-1 operands for the next expansion */
	lxv	vs32, DISP2(\Index, 0 + \OffsetA)(\AREG)
	lxv	vs48, DISP2(\Index, 0+\OffsetB)(\BREG)
	xxswapd	vs49, vs48
.endif
	/* step 2 */
	xvmaddadp	vs0, vs40, vs52
	xvmaddadp	vs1, vs40, vs53
.if \Complete==0
	/* reload the step-2 operands for the next expansion */
	lxv	vs40, DISP2(\Index,16+0+ \OffsetA)(\AREG)
	lxv	vs52, DISP2(\Index, 16+\OffsetB)(\BREG)
.endif
.if \IsLast==1
.if \Complete==1
	addi	\AREG, \AREG, DISP2(\Index,\OffsetA)
	addi	\BREG, \BREG, DISP2(\Index,\OffsetB)
.else
	addi	\AREG, \AREG, DISP2(\Index,32)
	addi	\BREG, \BREG, DISP2(\Index,32)
.endif
.endif
.endm
 


/* KERNEL1x1: one complete 1x1 step; load, then advance AO and BO by 16 each and accumulate. */
.macro KERNEL1x1
	LOAD1x1
	END1x1	AO, BO, 16, 16
.endm



/* SAVE1x1: store the 1x1 result (vs0/vs1) at CO via SAVE1, then advance CO by 16 bytes. */
.macro SAVE1x1
	SAVE1	vs0,vs1,CO,0
	addi	CO, CO, 16
.endm

/**************************** TRMM POINTER REFRESH MACROS *************************/


/*
 * SHIFT_REG: REG1 = REG2 * SHIFT_VAL * 16, done as a left shift
 * (SHIFT_VAL 16/8/4/2/1 -> shift by 8/7/6/5/4).
 *   REG1      - destination register
 *   REG2      - source register
 *   SHIFT_VAL - assemble-time constant; must be 16, 8, 4, 2 or 1
 * Any other SHIFT_VAL would leave REG1 unset, so it is rejected at
 * assembly time instead of silently expanding to nothing.
 * NOTE(review): slwi is a 32-bit shift; confirm the shifted values never
 * need more than 32 bits.
 */
.macro SHIFT_REG  REG1,REG2,SHIFT_VAL
	.if \SHIFT_VAL==16
	slwi	\REG1, \REG2, 8
	.elseif \SHIFT_VAL==8
	slwi	\REG1, \REG2, 7
	.elseif \SHIFT_VAL==4
	slwi	\REG1, \REG2, 6
	.elseif \SHIFT_VAL==2
	slwi	\REG1, \REG2, 5
	.elseif \SHIFT_VAL==1
	slwi	\REG1, \REG2, 4
	.else
	.error "SHIFT_REG: SHIFT_VAL must be 16, 8, 4, 2 or 1"
	.endif
.endm
/*
//#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// 		ptrbb = bb;
// #else
// 		ptrba += off*16;
// 		ptrbb = bb + off*2;
// #endif
*/


/*
 * REFRESH_POINTERS: reposition the A and B pointers for the TRMM kernel.
 *   PTR_A, PTR_B - pointers to update
 *   OFF_VAL      - current offset (in values)
 *   B_VAL        - base address of B
 *   C_A, C_B     - number of values per step in A and B (SHIFT_REG constants)
 * Clobbers T2 and T4 in the offset case.
 */
.macro REFRESH_POINTERS  PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	/* ptrbb = bb */
	mr	\PTR_B, \B_VAL
#else
	/* ptrba = ptrba + off*C_A; ptrbb = bb + off*C_B (scaled to bytes by SHIFT_REG) */
	SHIFT_REG T4,\OFF_VAL,\C_B
	SHIFT_REG T2,\OFF_VAL,\C_A
	add	\PTR_B, \B_VAL, T4
	add	\PTR_A, \PTR_A, T2
#endif
.endm

/*
// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
// 		temp = bk-off;
// #elif defined(LEFT)
// 		temp = off+16;	// number of values in A
// #else
// 		temp = off+2;	// number of values in B
// #endif
*/


/*
 * REFRESH_TEMP_BK: compute the temporary loop count TEMP_BK for the TRMM
 * kernel from BK_VAL and OFF_VAL.
 *   INCR_A, INCR_B - number of values in A and B (immediates)
 */
.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	/* temp = bk - off */
	sub	\TEMP_BK, \BK_VAL, \OFF_VAL
#elif defined(LEFT)
	/* temp = off + INCR_A (number of values in A) */
	addi	\TEMP_BK, \OFF_VAL, \INCR_A
#else
	/* temp = off + INCR_B (number of values in B) */
	addi	\TEMP_BK, \OFF_VAL, \INCR_B
#endif
.endm
/*
// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// 		temp = bk - off;
// #ifdef LEFT
// 		temp -= 16; // number of values in A
// #else
// 		temp -= 2; // number of values in B
// #endif
// 		ptrba += temp*16;
// 		ptrbb += temp*2;
// #endif
// #ifdef LEFT
// 		off += 16; // number of values in A
// #endif
*/
 


/*
 * REFRESH_AFTER_SAVE: pointer and offset bookkeeping after a TRMM tile has
 * been saved.
 *   TEMP_BK      - scratch register for the remaining count
 *   BK_VAL       - bk
 *   OFF_VAL      - current offset; incremented by C_A when LEFT is defined
 *   PTR_B, PTR_A - pointers advanced past the remaining values
 *   C_A, C_B     - number of values per step in A and B
 * Clobbers T2 and T4 when the pointers are advanced.
 */
.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	/* temp = bk - off */
	sub	\TEMP_BK, \BK_VAL, \OFF_VAL
#ifdef LEFT
	/* temp -= C_A (number of values in A) */
	addi	\TEMP_BK, \TEMP_BK, -\C_A
#else
	/* temp -= C_B (number of values in B) */
	addi	\TEMP_BK, \TEMP_BK, -\C_B
#endif
	/* ptrba += temp*C_A; ptrbb += temp*C_B (scaled to bytes by SHIFT_REG) */
	SHIFT_REG T4,\TEMP_BK,\C_A
	SHIFT_REG T2,\TEMP_BK,\C_B
	add	\PTR_A, \PTR_A, T4
	add	\PTR_B, \PTR_B, T2
#endif
#ifdef LEFT
	/* off += C_A (number of values in A) */
	addi	\OFF_VAL, \OFF_VAL, \C_A
#endif
.endm

